Create the design matrix
## Step 1: filter out genes with all-zero counts
# cts_filtered <- cts[rowSums(cts) > 0, ]
# keep <- rowSums(cpm(cts) > 1) >= 5
min_samples <- ceiling(0.2 * nrow(meta_data)) # about 25 samples
keep <- rowSums(cpm(cts) > 1) >= min_samples
cts_filtered <- cts[keep, ]
dim(cts_filtered)
#> [1] 15744 126
### Step 2: Create a DGEList object
dge <- DGEList(counts = cts_filtered)
# Optional: TMM normalization (especially if library sizes vary)
dge <- calcNormFactors(dge)
### Step 3: Create the design matrix
design <- model.matrix(~ 0 + met_site_group, data = meta_data)
colnames(design) <- levels(meta_data$met_site_group)
### Step 4: Create contrast matrix
contrast.matrix <- makeContrasts(
## primary as normative
skin_vs_primary = skin - primary,
lung_vs_primary = lung - primary,
liver_vs_primary = liver - primary,
lymph_node_vs_primary = lymph_node - primary,
pleural_effusion_vs_primary = pleural_effusion - primary,
central_nervous_system_vs_primary = central_nervous_system - primary,
ascites_vs_primary = ascites - primary,
small_intestine_vs_primary = small_intestine - primary,
## skin as normative
# skin_vs_lung = lung - skin,
# skin_vs_lung = liver - skin,
# skin_vs_lung = lymph_node - skin,
# skin_vs_lung = pleural_effusion - skin,
# ## lymph_node as normative
# skin_vs_lung = lung - skin,
# skin_vs_lung = liver - skin,
# skin_vs_lung = lymph_node - skin,
# skin_vs_lung = pleural_effusion - skin,
levels = design)
Annotate DE Results
We annotate the DE genes with Ensembl and Entrez IDs from
Biomart.
getBM(attributes = c("ensembl_gene_id", "entrezgene_id", "external_gene_name"),
mart = useDataset("hsapiens_gene_ensembl", useMart("ensembl"))) %>%
dplyr::rename(ensembl_id = ensembl_gene_id,
entrez_id = entrezgene_id,
gene_name = external_gene_name) %>%
dplyr::filter(stringr::str_length(gene_name) > 1,
!duplicated(ensembl_id),
!duplicated(gene_name)) %>%
saveRDS(file = "./data/human_biomart.rds")
human_biomart <- readRDS(file = "./data/human_biomart.rds")
# Extract top differentially expressed genes
topTable(fit2, coef = "skin_vs_primary", adjust.method = "BH",
number = nrow(cts_filtered)) %>%
tibble::rownames_to_column(var = "gene_name") %>%
dplyr::left_join(human_biomart, by = "gene_name") %>%
dplyr::left_join(
cts_filtered %>%
tibble::rownames_to_column(var = "gene_name") %>%
dplyr::rowwise() %>%
dplyr::mutate(
skin_mean_exp = mean(c_across(
meta_data %>%
dplyr::filter(met_site_group == "skin") %>%
tibble::rownames_to_column(var = "depmap_id") %>%
dplyr::pull(depmap_id))),
primary_mean_exp = mean(c_across(
meta_data %>%
dplyr::filter(met_site_group == "primary") %>%
tibble::rownames_to_column(var = "depmap_id") %>%
dplyr::pull(depmap_id)))) %>%
dplyr::ungroup() %>%
dplyr::select(gene_name, contains("exp")),
by = "gene_name") %>%
dplyr::select(gene_name, contains("id"), everything()) -> skin_de
topTable(fit2, coef = "lung_vs_primary", adjust.method = "BH",
number = nrow(cts_filtered)) %>%
tibble::rownames_to_column(var = "gene_name") %>%
dplyr::left_join(human_biomart, by = "gene_name") %>%
dplyr::left_join(
cts_filtered %>%
tibble::rownames_to_column(var = "gene_name") %>%
dplyr::rowwise() %>%
dplyr::mutate(
skin_mean_exp = mean(c_across(
meta_data %>%
dplyr::filter(met_site_group == "lung") %>%
tibble::rownames_to_column(var = "depmap_id") %>%
dplyr::pull(depmap_id))),
primary_mean_exp = mean(c_across(
meta_data %>%
dplyr::filter(met_site_group == "primary") %>%
tibble::rownames_to_column(var = "depmap_id") %>%
dplyr::pull(depmap_id)))) %>%
dplyr::ungroup() %>%
dplyr::select(gene_name, contains("exp")),
by = "gene_name") %>%
dplyr::select(gene_name, contains("id"), everything()) -> lung_de
topTable(fit2, coef = "liver_vs_primary", adjust.method = "BH",
number = nrow(cts_filtered)) %>%
tibble::rownames_to_column(var = "gene_name") %>%
dplyr::left_join(human_biomart, by = "gene_name") %>%
dplyr::left_join(
cts_filtered %>%
tibble::rownames_to_column(var = "gene_name") %>%
dplyr::rowwise() %>%
dplyr::mutate(
skin_mean_exp = mean(c_across(
meta_data %>%
dplyr::filter(met_site_group == "liver") %>%
tibble::rownames_to_column(var = "depmap_id") %>%
dplyr::pull(depmap_id))),
primary_mean_exp = mean(c_across(
meta_data %>%
dplyr::filter(met_site_group == "primary") %>%
tibble::rownames_to_column(var = "depmap_id") %>%
dplyr::pull(depmap_id)))) %>%
dplyr::ungroup() %>%
dplyr::select(gene_name, contains("exp")),
by = "gene_name") %>%
dplyr::select(gene_name, contains("id"), everything()) -> liver_de
topTable(fit2, coef = "lymph_node_vs_primary", adjust.method = "BH",
number = nrow(cts_filtered)) %>%
tibble::rownames_to_column(var = "gene_name") %>%
dplyr::left_join(human_biomart, by = "gene_name") %>%
dplyr::left_join(
cts_filtered %>%
tibble::rownames_to_column(var = "gene_name") %>%
dplyr::rowwise() %>%
dplyr::mutate(
skin_mean_exp = mean(c_across(
meta_data %>%
dplyr::filter(met_site_group == "lymph_node") %>%
tibble::rownames_to_column(var = "depmap_id") %>%
dplyr::pull(depmap_id))),
primary_mean_exp = mean(c_across(
meta_data %>%
dplyr::filter(met_site_group == "primary") %>%
tibble::rownames_to_column(var = "depmap_id") %>%
dplyr::pull(depmap_id)))) %>%
dplyr::ungroup() %>%
dplyr::select(gene_name, contains("exp")),
by = "gene_name") %>%
dplyr::select(gene_name, contains("id"), everything()) -> lymph_de
topTable(fit2, coef = "pleural_effusion_vs_primary", adjust.method = "BH",
number = nrow(cts_filtered)) %>%
tibble::rownames_to_column(var = "gene_name") %>%
dplyr::left_join(human_biomart, by = "gene_name") %>%
dplyr::left_join(
cts_filtered %>%
tibble::rownames_to_column(var = "gene_name") %>%
dplyr::rowwise() %>%
dplyr::mutate(
skin_mean_exp = mean(c_across(
meta_data %>%
dplyr::filter(met_site_group == "pleural_effusion") %>%
tibble::rownames_to_column(var = "depmap_id") %>%
dplyr::pull(depmap_id))),
primary_mean_exp = mean(c_across(
meta_data %>%
dplyr::filter(met_site_group == "primary") %>%
tibble::rownames_to_column(var = "depmap_id") %>%
dplyr::pull(depmap_id)))) %>%
dplyr::ungroup() %>%
dplyr::select(gene_name, contains("exp")),
by = "gene_name") %>%
dplyr::select(gene_name, contains("id"), everything()) -> pe_de
topTable(fit2, coef = "central_nervous_system_vs_primary", adjust.method = "BH",
number = nrow(cts_filtered)) %>%
tibble::rownames_to_column(var = "gene_name") %>%
dplyr::left_join(human_biomart, by = "gene_name") %>%
dplyr::left_join(
cts_filtered %>%
tibble::rownames_to_column(var = "gene_name") %>%
dplyr::rowwise() %>%
dplyr::mutate(
skin_mean_exp = mean(c_across(
meta_data %>%
dplyr::filter(met_site_group == "central_nervous_system") %>%
tibble::rownames_to_column(var = "depmap_id") %>%
dplyr::pull(depmap_id))),
primary_mean_exp = mean(c_across(
meta_data %>%
dplyr::filter(met_site_group == "primary") %>%
tibble::rownames_to_column(var = "depmap_id") %>%
dplyr::pull(depmap_id)))) %>%
dplyr::ungroup() %>%
dplyr::select(gene_name, contains("exp")),
by = "gene_name") %>%
dplyr::select(gene_name, contains("id"), everything()) -> cns_de
topTable(fit2, coef = "ascites_vs_primary", adjust.method = "BH",
number = nrow(cts_filtered)) %>%
tibble::rownames_to_column(var = "gene_name") %>%
dplyr::left_join(human_biomart, by = "gene_name") %>%
dplyr::left_join(
cts_filtered %>%
tibble::rownames_to_column(var = "gene_name") %>%
dplyr::rowwise() %>%
dplyr::mutate(
skin_mean_exp = mean(c_across(
meta_data %>%
dplyr::filter(met_site_group == "ascites") %>%
tibble::rownames_to_column(var = "depmap_id") %>%
dplyr::pull(depmap_id))),
primary_mean_exp = mean(c_across(
meta_data %>%
dplyr::filter(met_site_group == "primary") %>%
tibble::rownames_to_column(var = "depmap_id") %>%
dplyr::pull(depmap_id)))) %>%
dplyr::ungroup() %>%
dplyr::select(gene_name, contains("exp")),
by = "gene_name") %>%
dplyr::select(gene_name, contains("id"), everything()) -> ascite_de
topTable(fit2, coef = "small_intestine_vs_primary", adjust.method = "BH",
number = nrow(cts_filtered)) %>%
tibble::rownames_to_column(var = "gene_name") %>%
dplyr::left_join(human_biomart, by = "gene_name") %>%
dplyr::left_join(
cts_filtered %>%
tibble::rownames_to_column(var = "gene_name") %>%
dplyr::rowwise() %>%
dplyr::mutate(
skin_mean_exp = mean(c_across(
meta_data %>%
dplyr::filter(met_site_group == "small_intestine") %>%
tibble::rownames_to_column(var = "depmap_id") %>%
dplyr::pull(depmap_id))),
primary_mean_exp = mean(c_across(
meta_data %>%
dplyr::filter(met_site_group == "primary") %>%
tibble::rownames_to_column(var = "depmap_id") %>%
dplyr::pull(depmap_id)))) %>%
dplyr::ungroup() %>%
dplyr::select(gene_name, contains("exp")),
by = "gene_name") %>%
dplyr::select(gene_name, contains("id"), everything()) -> si_de
Results
A Volcano plot (log2 fold change vs -log10 p-value) helps identify
genes that display large magnitude changes and high significance.